library(tidyverse)
library(here)
library(hrbrthemes)
library(janitor)
library(corrplot)
# Reproducibility: select the "Rounding" sample() algorithm (the default
# before R 3.6.0) and then fix the seed, so any random sampling further down
# the script gives the same result on every run.
RNGkind(sample.kind = "Rounding")
set.seed(1)
# Make hrbrthemes' theme_ipsum() the default theme for every ggplot below.
theme_set(theme_ipsum())
# Read data/creditcard.csv (path resolved with here()) into a tibble.
credit <- here("data", "creditcard.csv") %>%
  read_csv() %>%
  as_tibble()
# Preview the first rows, then check whether any value is missing.
head(credit)
anyNA(credit)
[1] FALSE
# Per-column summary statistics. skim() lives in skimr, which is not attached
# by the library() calls at the top of the script, so it is namespace-qualified
# here to keep the call working in a fresh session.
skimr::skim(credit)
── Data Summary ────────────────────────
Values
Name credit
Number of rows 284807
Number of columns 31
_______________________
Column type frequency:
factor 1
numeric 30
________________________
Group variables None
── Variable type: factor ────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate ordered n_unique top_counts
1 Class 0 1 FALSE 2 Non: 284315, Fra: 492
── Variable type: numeric ───────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 Time 0 1 9.48e+ 4 47488. 0 54202. 84692 139320. 172792 ▃▇▅▆▇
2 V1 0 1 1.17e-15 1.96 -56.4 -0.920 0.0181 1.32 2.45 ▁▁▁▁▇
3 V2 0 1 3.12e-16 1.65 -72.7 -0.599 0.0655 0.804 22.1 ▁▁▁▇▁
4 V3 0 1 -1.36e-15 1.52 -48.3 -0.890 0.180 1.03 9.38 ▁▁▁▁▇
5 V4 0 1 2.11e-15 1.42 -5.68 -0.849 -0.0198 0.743 16.9 ▂▇▁▁▁
6 V5 0 1 9.80e-16 1.38 -114. -0.692 -0.0543 0.612 34.8 ▁▁▁▇▁
7 V6 0 1 1.51e-15 1.33 -26.2 -0.768 -0.274 0.399 73.3 ▁▇▁▁▁
8 V7 0 1 -5.42e-16 1.24 -43.6 -0.554 0.0401 0.570 121. ▁▇▁▁▁
9 V8 0 1 1.03e-16 1.19 -73.2 -0.209 0.0224 0.327 20.0 ▁▁▁▇▁
10 V9 0 1 -2.42e-15 1.10 -13.4 -0.643 -0.0514 0.597 15.6 ▁▁▇▁▁
11 V10 0 1 2.23e-15 1.09 -24.6 -0.535 -0.0929 0.454 23.7 ▁▁▇▁▁
12 V11 0 1 1.71e-15 1.02 -4.80 -0.762 -0.0328 0.740 12.0 ▁▇▁▁▁
13 V12 0 1 -1.24e-15 0.999 -18.7 -0.406 0.140 0.618 7.85 ▁▁▁▇▁
14 V13 0 1 8.35e-16 0.995 -5.79 -0.649 -0.0136 0.663 7.13 ▁▃▇▁▁
15 V14 0 1 1.23e-15 0.959 -19.2 -0.426 0.0506 0.493 10.5 ▁▁▁▇▁
16 V15 0 1 4.84e-15 0.915 -4.50 -0.583 0.0481 0.649 8.88 ▁▇▂▁▁
17 V16 0 1 1.43e-15 0.876 -14.1 -0.468 0.0664 0.523 17.3 ▁▁▇▁▁
18 V17 0 1 -3.78e-16 0.849 -25.2 -0.484 -0.0657 0.400 9.25 ▁▁▁▇▁
19 V18 0 1 9.76e-16 0.838 -9.50 -0.499 -0.00364 0.501 5.04 ▁▁▂▇▁
20 V19 0 1 1.04e-15 0.814 -7.21 -0.456 0.00373 0.459 5.59 ▁▁▇▂▁
21 V20 0 1 6.41e-16 0.771 -54.5 -0.212 -0.0625 0.133 39.4 ▁▁▇▁▁
22 V21 0 1 1.69e-16 0.735 -34.8 -0.228 -0.0295 0.186 27.2 ▁▁▇▁▁
23 V22 0 1 -3.38e-16 0.726 -10.9 -0.542 0.00678 0.529 10.5 ▁▁▇▁▁
24 V23 0 1 2.67e-16 0.624 -44.8 -0.162 -0.0112 0.148 22.5 ▁▁▁▇▁
25 V24 0 1 4.47e-15 0.606 -2.84 -0.355 0.0410 0.440 4.58 ▁▇▆▁▁
26 V25 0 1 5.11e-16 0.521 -10.3 -0.317 0.0166 0.351 7.52 ▁▁▇▂▁
27 V26 0 1 1.68e-15 0.482 -2.60 -0.327 -0.0521 0.241 3.52 ▁▆▇▁▁
28 V27 0 1 -3.67e-16 0.404 -22.6 -0.0708 0.00134 0.0910 31.6 ▁▁▇▁▁
29 V28 0 1 -1.23e-16 0.330 -15.4 -0.0530 0.0112 0.0783 33.8 ▁▇▁▁▁
30 log_amount 0 1 3.15e+ 0 1.66 0 1.89 3.14 4.36 10.2 ▅▇▅▁▁
# Print summary() for each column of credit, one column at a time.
# seq_along() is used instead of 1:ncol(credit) so that a zero-column input
# yields no iterations rather than looping over c(1, 0).
for (i in seq_along(credit)) {
  print(summary(credit[, i]))
}
Time
Min. : 0
1st Qu.: 54202
Median : 84692
Mean : 94814
3rd Qu.:139320
Max. :172792
V1
Min. :-56.40751
1st Qu.: -0.92037
Median : 0.01811
Mean : 0.00000
3rd Qu.: 1.31564
Max. : 2.45493
V2
Min. :-72.71573
1st Qu.: -0.59855
Median : 0.06549
Mean : 0.00000
3rd Qu.: 0.80372
Max. : 22.05773
V3
Min. :-48.3256
1st Qu.: -0.8904
Median : 0.1799
Mean : 0.0000
3rd Qu.: 1.0272
Max. : 9.3826
V4
Min. :-5.68317
1st Qu.:-0.84864
Median :-0.01985
Mean : 0.00000
3rd Qu.: 0.74334
Max. :16.87534
V5
Min. :-113.74331
1st Qu.: -0.69160
Median : -0.05434
Mean : 0.00000
3rd Qu.: 0.61193
Max. : 34.80167
V6
Min. :-26.1605
1st Qu.: -0.7683
Median : -0.2742
Mean : 0.0000
3rd Qu.: 0.3986
Max. : 73.3016
V7
Min. :-43.5572
1st Qu.: -0.5541
Median : 0.0401
Mean : 0.0000
3rd Qu.: 0.5704
Max. :120.5895
V8
Min. :-73.21672
1st Qu.: -0.20863
Median : 0.02236
Mean : 0.00000
3rd Qu.: 0.32735
Max. : 20.00721
V9
Min. :-13.43407
1st Qu.: -0.64310
Median : -0.05143
Mean : 0.00000
3rd Qu.: 0.59714
Max. : 15.59500
V10
Min. :-24.58826
1st Qu.: -0.53543
Median : -0.09292
Mean : 0.00000
3rd Qu.: 0.45392
Max. : 23.74514
V11
Min. :-4.79747
1st Qu.:-0.76249
Median :-0.03276
Mean : 0.00000
3rd Qu.: 0.73959
Max. :12.01891
V12
Min. :-18.6837
1st Qu.: -0.4056
Median : 0.1400
Mean : 0.0000
3rd Qu.: 0.6182
Max. : 7.8484
V13
Min. :-5.79188
1st Qu.:-0.64854
Median :-0.01357
Mean : 0.00000
3rd Qu.: 0.66251
Max. : 7.12688
V14
Min. :-19.2143
1st Qu.: -0.4256
Median : 0.0506
Mean : 0.0000
3rd Qu.: 0.4931
Max. : 10.5268
V15
Min. :-4.49894
1st Qu.:-0.58288
Median : 0.04807
Mean : 0.00000
3rd Qu.: 0.64882
Max. : 8.87774
V16
Min. :-14.12985
1st Qu.: -0.46804
Median : 0.06641
Mean : 0.00000
3rd Qu.: 0.52330
Max. : 17.31511
V17
Min. :-25.16280
1st Qu.: -0.48375
Median : -0.06568
Mean : 0.00000
3rd Qu.: 0.39968
Max. : 9.25353
V18
Min. :-9.498746
1st Qu.:-0.498850
Median :-0.003636
Mean : 0.000000
3rd Qu.: 0.500807
Max. : 5.041069
V19
Min. :-7.213527
1st Qu.:-0.456299
Median : 0.003735
Mean : 0.000000
3rd Qu.: 0.458949
Max. : 5.591971
V20
Min. :-54.49772
1st Qu.: -0.21172
Median : -0.06248
Mean : 0.00000
3rd Qu.: 0.13304
Max. : 39.42090
V21
Min. :-34.83038
1st Qu.: -0.22839
Median : -0.02945
Mean : 0.00000
3rd Qu.: 0.18638
Max. : 27.20284
V22
Min. :-10.933144
1st Qu.: -0.542350
Median : 0.006782
Mean : 0.000000
3rd Qu.: 0.528554
Max. : 10.503090
V23
Min. :-44.80774
1st Qu.: -0.16185
Median : -0.01119
Mean : 0.00000
3rd Qu.: 0.14764
Max. : 22.52841
V24
Min. :-2.83663
1st Qu.:-0.35459
Median : 0.04098
Mean : 0.00000
3rd Qu.: 0.43953
Max. : 4.58455
V25
Min. :-10.29540
1st Qu.: -0.31715
Median : 0.01659
Mean : 0.00000
3rd Qu.: 0.35072
Max. : 7.51959
V26
Min. :-2.60455
1st Qu.:-0.32698
Median :-0.05214
Mean : 0.00000
3rd Qu.: 0.24095
Max. : 3.51735
V27
Min. :-22.565679
1st Qu.: -0.070840
Median : 0.001342
Mean : 0.000000
3rd Qu.: 0.091045
Max. : 31.612198
V28
Min. :-15.43008
1st Qu.: -0.05296
Median : 0.01124
Mean : 0.00000
3rd Qu.: 0.07828
Max. : 33.84781
Amount
Min. : 0.00
1st Qu.: 5.60
Median : 22.00
Mean : 88.35
3rd Qu.: 77.17
Max. :25691.16
Class
Min. :0.000000
1st Qu.:0.000000
Median :0.000000
Mean :0.001728
3rd Qu.:0.000000
Max. :1.000000
# Draw one density plot per column, skipping the Class label.
# Class is excluded by name rather than by position (-31) so the loop keeps
# working if columns are added or reordered.
for (i in setdiff(names(credit), "Class")) {
  # aes_string() is deprecated; the .data pronoun looks the column up from its
  # string name. labs() keeps the column name as the x-axis title.
  p <- ggplot(credit, aes(x = .data[[i]])) +
    geom_density(fill = "cornsilk") +
    labs(x = i)
  print(p)
}
# Histogram of Time, filled by Class, with one free-scaled panel per Class.
credit %>%
  ggplot(aes(Time, fill = Class)) +
  geom_histogram() +
  facet_wrap(vars(Class), scales = "free")
# Histogram of log-transformed Amount, with one free-scaled panel per Class.
# Amount contains zeros and log(0) is -Inf, which the histogram would drop as
# non-finite; adding 1 first (the same shift used for log_amount) keeps those
# transactions in the plot.
ggplot(data = credit, aes(x = log(Amount + 1), fill = Class)) +
  geom_histogram() +
  facet_wrap(~Class, scales = "free")
# Scatter of log-transformed Amount against Time, one free-scaled panel per
# Class.
# - alpha is a fixed transparency, so it is set on the geom; inside aes() it
#   would be treated as a data mapping (rescaled, with a spurious legend).
# - Amount contains zeros and log(0) is -Inf; adding 1 first keeps those
#   points finite so they are not dropped.
ggplot(data = credit, aes(x = Time, y = log(Amount + 1))) +
  geom_point(alpha = 0.2) +
  facet_wrap(~Class, scales = "free")
The Amount variable is clearly skewed, so it is worth applying a log transformation. Because some amounts are 0, and log(0) is -Inf, we add 1 to Amount before taking the log so that every transformed value is finite.
# Append log_amount = log(Amount + 1); the +1 shift keeps zero amounts finite.
credit <- mutate(credit, log_amount = log(Amount + 1))
# Min / quartiles / mean / max of the new column.
credit %>%
  pull(log_amount) %>%
  summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.887 3.135 3.152 4.359 10.154
# Density of the log-transformed amount.
credit %>%
  ggplot(aes(log_amount)) +
  geom_density(fill = "cornsilk")
# Correlation plot of every column except the Class label (upper triangle,
# square glyphs). Class is dropped by name rather than by position (-31) so
# the selection stays correct when columns are added or reordered.
credit %>%
  select(-Class) %>%
  cor() %>%
  corrplot(method = "square", type = "upper")
Transforming Amount has helped to reduce collinearity among the predictors. The only notable correlations are between V3 and Time, and between V2 and log_amount. This is expected: PCA produces orthogonal linear combinations, so the resulting components should be largely uncorrelated with one another.